In [1]:
# Render our plots inline
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np # we'll need this for sqrt and mean
# The pandas option 'display.mpl_style' was deprecated (pandas 0.18) and later
# removed, so setting it raises an OptionError on current pandas. matplotlib's
# own 'ggplot' style sheet is the documented replacement.
plt.style.use('ggplot')  # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)  # default figure size (width, height) in inches
In [2]:
# Load the dataset from the working directory and preview its first rows.
quality = pd.read_csv("quality.csv")
quality.head(n=5)
Out[2]:
In [3]:
# Rows per PoorCare value (count() reports the non-null cells in each column).
quality.groupby(by='PoorCare').count()
Out[3]:
The baseline model, which always predicts the most frequent class, has an accuracy of:
In [4]:
# Baseline accuracy: share of rows that fall in the larger of the two groups.
# NOTE(review): 98 and 33 look hand-copied from the groupby counts above — confirm
# they still match if the data file changes.
98/(98+33)
Out[4]:
We need to randomly split our data into training and test sets. For now, I'll use scikit-learn's `train_test_split` with a fixed random seed. Ideally, we want the same rows as in R, but I'll only implement this when I know how to run R from Python.
In [5]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function now
# lives in `sklearn.model_selection`. Fall back to the old module so the cell
# still runs on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# 75% of the rows go to training; random_state pins the shuffle so the split
# is reproducible from run to run.
train, test = train_test_split(quality, train_size=0.75, random_state=1)

# Older scikit-learn returns plain arrays here, so rebuild labelled frames;
# on versions that already return DataFrames this is a harmless re-wrap.
qualityTrain = pd.DataFrame(train, columns=quality.columns)
qualityTest = pd.DataFrame(test, columns=quality.columns)
In [6]:
# Store the training target as integers before it is handed to the model.
poor_care_int = qualityTrain['PoorCare'].astype(int)
qualityTrain['PoorCare'] = poor_care_int
In [7]:
import statsmodels.api as sm

# Logistic regression of PoorCare on two predictors plus an intercept.
cols = ['OfficeVisits', 'Narcotics']
y = qualityTrain['PoorCare']
x = sm.add_constant(qualityTrain[cols])  # prepend the intercept column

# Cast the design matrix to float before fitting.
model = sm.Logit(y, x.astype(float)).fit()
model.summary()
Out[7]:
In [8]:
# Score the held-out rows with the fitted model (same predictors + intercept).
x = sm.add_constant(qualityTest[cols])
predTest = model.predict(x.astype(float))
In [9]:
# In-sample predicted probabilities, used below to explore decision thresholds.
x = sm.add_constant(qualityTrain[cols])
predTrain = model.predict(x.astype(float))
In [10]:
# Confusion table on the training set at a 0.2 probability cut-off.
pd.crosstab(
    index=qualityTrain['PoorCare'],
    columns=predTrain > 0.2,
    colnames=['PredictedCare'],
)
Out[10]:
In [11]:
# Counts transcribed from the 0.2-threshold confusion table.
true_pos, false_neg = 14, 8
true_neg, false_pos = 60, 16

sens = true_pos / (true_pos + false_neg)  # sensitivity = TP / (TP + FN)
spec = true_neg / (true_neg + false_pos)  # specificity = TN / (TN + FP)
print(sens)
print(spec)
Increase the threshold to 0.7:
In [12]:
# Confusion table on the training set at a 0.7 probability cut-off.
pd.crosstab(
    index=qualityTrain['PoorCare'],
    columns=predTrain > 0.7,
    colnames=['PredictedCare'],
)
Out[12]:
In [13]:
# Compute the rates from the data instead of re-typing counts from the table.
# The previous hard-coded 7/(7+8) implied 15 positive training rows, whereas the
# 0.2-threshold figures (14 + 8) imply 22 for the same training set, so one of
# the hand-copied counts was wrong.
# Assumes PoorCare is coded 0/1 with 1 = poor care — TODO confirm against the data.
actual_poor = np.asarray(qualityTrain['PoorCare']) == 1
predicted_poor = np.asarray(predTrain) > 0.7  # same cut-off as the crosstab above

# Mean of a boolean mask is the fraction of True values, which also avoids
# integer division on Python 2.
sens = predicted_poor[actual_poor].mean()        # TP / (TP + FN)
spec = (~predicted_poor[~actual_poor]).mean()    # TN / (TN + FP)
print(sens)
print(spec)
scikit-learn can compute these rates for every threshold at once, hooray! Docs: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
In [14]:
from sklearn import metrics

# ROC points for the training predictions: false/true positive rates and the
# thresholds at which they were evaluated.
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=qualityTrain['PoorCare'],
    y_score=predTrain,
)
In [15]:
# First array returned by metrics.roc_curve (plotted below as "False Positive Rate").
fpr
Out[15]:
In [16]:
# Third array returned by metrics.roc_curve: the thresholds behind each ROC point.
thresholds
Out[16]:
In [17]:
# Second array returned by metrics.roc_curve (plotted below as "True Positive Rate").
tpr
Out[17]:
In [18]:
# Draw the training-set ROC curve on a single set of axes.
fig, ax1 = plt.subplots()
ax1.plot(fpr, tpr)
ax1.set_xlabel("False Positive Rate")
ax1.set_ylabel("True Positive Rate")
Out[18]:
In [21]:
# Area under the training-set ROC curve.
auc = metrics.roc_auc_score(
    y_true=qualityTrain['PoorCare'],
    y_score=predTrain,
)
print(auc)
In [ ]: